Now that we have seen most of the basics, let’s start the fun stuff !
There are two main ways to plot data in R:
ggplot2 and tidy data framesggplot2 is extremely powerful and some people advise not even teaching base graphics to beginners. But I find that some times it’s just quicker/easier with base graphics, so I will still present it, although not in full details.
x <- seq(-3*pi,3*pi,length=50)
y <- sin(x)
z <- sin(x)^2
df <- data.frame(x=x, y=y)
plot(x,y) # plot providing x and y data
plot(df) # plot providing a two-columns data.frame
plot(df, type="l")
plot(df, type="b")
df <- data.frame(x=x, y=y, z=z, w=z*y)
plot(df) # plot providing a multi-columns data.frame
OK, easy. Now let’s do some tuning of this, because it’s a tad ugly… Type in each command and see what they do.
# create some fake data
x <- seq(-3*pi,3*pi,length=100)
df <- data.frame(x=x, y=sin(x), z=sin(x)^2)
# add some styling parameters
par(family = "Helvetica", cex.lab=1.5, cex.axis=1.4,
mgp = c(2.4, .5, 0), tck=0.02, mar=c(4, 4, 2, .5), lwd=2, las=1)
plot(df$x,df$y,
type = "l", # "l" for lines, "p" for points
xlab = "X values",
ylab = "Intensity",
axes = FALSE,
main = "Some Plot",
ylim = c(-1,2)
)
# vertical line in 0
abline(v=0,lty=2,lwd=2)
# horizontal line in 0
abline(h=0,lty=3,lwd=2)
# line with coefficients a (intercept) and b (slope)
abline(a=0,b=.1,lty=4,lwd=1)
# add a line
lines(df$x,df$z,type = "l",col="red",lwd=3)
# add points
points(df$x,df$z*df$y,col="royalblue",pch=16,cex=1)
# add custom axis.
# Default with axis(1);axis(2);axis(3, labels=FALSE);axis(4, labels=FALSE);
# Bottom
axis(1,at=seq(-10,10,2),labels=TRUE,tck=0.02)
axis(1,at=seq(-10,10,1),labels=FALSE,tck=0.01); # small inter-ticks
# Top
axis(3,at=seq(-10,10,2),labels=FALSE)
axis(3,at=seq(-10,10,1),labels=FALSE,tck=0.01); # small inter-ticks
# Left
axis(2,at=seq(-1,1,.2),labels=TRUE)
axis(2,at=seq(-1,1,.1),labels=FALSE,tck=0.01); # small inter-ticks
# Right
axis(4,at=seq(-1,1,.2),labels=FALSE)
axis(4,at=seq(-1,1,.1),labels=FALSE,tck=0.01); # small inter-ticks
# Draw a box
box()
# Print legend
legend("topleft",
cex=1.4, #size of text
lty=c(1,1,NA), # type of line (1 is full, 2 is dashed...)
lwd=c(1,3,NA), # line width
pch=c(NA,NA,16), # type of points
col=c("black","red","royalblue"), # color
bty = "n", # no box around legend
legend=c("sin(x)",expression("sin(x)"^2),expression("sin(x)"^3))
)
Most needs should be covered with this simple plot that can be adapted.
A plot can be exported if surrounded by XXX and dev.off(), with XXX that can be pdf("xxx.pdf",height=6, width=8), png("xxx.png",height=6, width=8)… Example:
pdf("test.pdf",height=6, width=8)
plot(x,y,
type="l",
xlab="x"
)
dev.off()
You can also export the graph as a .tex file using tikz(), which allows you to use \(\LaTeX\) mathematical expressions (don’t forget to escape the \ character):
library(tikzDevice)
tikz("plot.tex",height=6, width=8,pointsize = 10,standAlone=TRUE)
plot(x,y,
type="l",
xlab="\\omega_i"
)
dev.off()
Go to Rstudio Preferences, Code, Edit code snippets, and add the following lines:
snippet plot
#pdf("xxx.pdf", height=6, width=8)
par(cex.lab=1.7, cex.axis=1.7, mgp = c(3, 0.9, 0),
tck=0.02, mar=c(4.5, 4.5, 1, 1), lwd = 3, las=1)
plot(${1:x},${2:y},
type="l", # plot with a line
ylim=c( , ),
xlim=c( , ),
lwd=2, # width of the line
lty=1, # type of line
axes=FALSE, # do not show axes
xlab="${1:x}", # x label
ylab="${2:y}", # y label
main="") # Title
legend("topright",
cex=1.5, # size of the text
pch=c(), # list of point types
lty=c(), # list of line types
lwd=c(), # list of line widths
col=c(), # list of line colors
bty="n", # no box around the legend
legend=c() # list of legend labels
)
# Draw axes with minor ticks
axis(1, at=seq(0,1,.2), labels=TRUE)
axis(1, at=seq(0,1,.1), labels=FALSE, tck=0.01)
axis(3, at=seq(0,1,.2), labels=FALSE)
axis(3, at=seq(0,1,.1), labels=FALSE, tck=0.01)
par(mgp = c(2.5, 0.2, 0))
axis(2, at=seq(0,10,1), labels=TRUE)
axis(2, at=seq(0,10,.5), labels=FALSE, tck=0.01)
axis(4, at=seq(0,10,1), labels=FALSE)
axis(4, at=seq(0,10,.5), labels=FALSE, tck=0.01)
box() # drow box around plot
#dev.off()
snippet ggplot
library(ggplot2)
ggplot(data=${1:df}, aes(x=${2:x}, y=${3:y}, color=${4:z}, size=${5:w})) +
geom_point() +
geom_smooth() +
theme_bw()
Try reproducing these plots:
Lets create a plot with different panels (a bit ugly without styling, you need to tweak the margins and text distance to plot with par(mar(), mgp()) before each plot):
# some fake data
x <- seq(-10,10,1)
d1 <- data.frame(x=x, y=sin(x))
d2 <- data.frame(x=x, y=cos(x))
d3 <- data.frame(x=x, y=exp(-x^2)*sin(x)^2)
# on a simple grid, use:
# par(mfrow=c(nrows, ncols))
par(mfrow=c(1, 3), mar=c(4,4,1,1))
plot(d1,type="l")
plot(d2,type="p")
plot(d3,type="b")
# creating the layout and styling
M <- matrix(c(c(1,1),c(2,3)), byrow=TRUE, ncol=2); M
## [,1] [,2]
## [1,] 1 1
## [2,] 2 3
nf <- layout(M, heights=c(1), widths=c(1))
# first plot
plot(d1,type="l")
# second plot
plot(d2,type="p")
# third plot
plot(d3,type="b")
# creating the layout and styling
M <- matrix(c(c(1,1),c(2,3)), byrow=FALSE, ncol=2); M
## [,1] [,2]
## [1,] 1 2
## [2,] 1 3
nf <- layout(M, heights=c(1), widths=c(1))
# first plot
plot(d1,type="l")
# second plot
plot(d2,type="p")
# third plot
plot(d3,type="b")
x <- rnorm(1e4, mean = 0, sd = 1)
# Barplot
hist(x)
# Density
y <- density(x, bw=0.1) # small kernel bandwidth
y2 <- density(x, bw=0.5) # larger kernel bandwidth
plot(y, lwd=2, main="", xlab="X values", xlim=c(-4,4))
lines(y2,col="red",lwd=2)
points(x, jitter(rep(.01,length(x)), amount=.01),
cex=1,pch=16, col=adjustcolor("royalblue", alpha=.01))
Plot the histogram of the age distribution in the class.
Further reading here and on the cheatsheet for example.
ggplot2 is a package (now even available for python) that completely changes the methodology of plotting data. With ggplot2, data are gathered in a tidy data.frame, and each column can be used as a parameter to tweak colors, point size, etc.
First things first, load the library:
library(ggplot2)
library(tidyverse) # for easier data manipulation
A quick plot can be made using the function qplot (for “quickplot”), very similar to the base graphics plot function. It’s great for allowing you to produce plots quickly, but it is highly recommended learning ggplot() as it makes it easier to create complex graphics.
Note how different is the default theme of the plot:
# some fake data
x <- seq(-10,10,.5)
y <- sin(x)
p1 <- qplot(x,y, geom="line") # ggplot2 quick plot
p2 <- qplot(x,y, geom="point") # ggplot2 quick plot
p1; p2
With ggplot2 is introduced the notion of “grammar of graphics” through the function ggplot(). What it means is that the plots are built through independent blocks that can be combined to create any wanted graphical display. To construct a plot, you need to provide building blocks such as:
Since a figure is worth a thousand words, let’s get to it. We will use the dataset diamonds built-in with the ggplot2 package. Let’s have a look:
diamonds
## # A tibble: 53,940 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.290 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
## # … with 53,930 more rows
diamonds contains 53940 lines and 10 columns in a tibble. ggplot2 can easily handle such large dataset.
Let’s say we want to see whether there is a correlation between price and weight (carat) of the diamonds:
p <- ggplot(data=diamonds,aes(x=carat,y=price))
p + geom_point()
OK, we’re onto something, but we can probably add some information to this plot. We will first cut the data above 3 carats because they are not relevant, and add some transparency to the points to see some statistical information.
p <- diamonds %>% filter(carat<=3) %>%
ggplot(aes(x=carat,y=price))
p + geom_point(alpha=0.5)
Let’s now see whether the clarity plays a role here by coloring the points according to the diamonds cut:
p <- diamonds %>% filter(carat<=3) %>%
ggplot(aes(x=carat,y=price, color=cut))
p + geom_point(alpha=0.5)
It looks like the price dispersion is homogeneous, we can make sure by adding a spline smoothing:
p + geom_point(alpha=0.5) + geom_smooth()
The slope evolution shows that in general, the better the cut, the higher the price. But there are some discrepancies that may be explained in another manner:
p <- diamonds %>% filter(carat<=3) %>%
ggplot(aes(x=carat,y=price, color=clarity))
p + geom_point(alpha=0.5) + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
It is often easier to grasp a multi-variable problem by plotting all our data in a facet plot using facet_wrap():
colors <- rainbow(length(unique(diamonds$clarity)))
p <- ggplot(diamonds, aes(x=price,y=carat)) +
geom_point(aes(color=clarity), alpha=0.5, size=1) +
geom_smooth(color="black") +
scale_colour_manual(values = colors, name="Clarity") +
facet_wrap(~cut)
p
Or by adding another graphical parameter such as the size of the points:
p <- ggplot(diamonds, aes(x=price,y=carat, size=cut)) +
geom_point(aes(color=clarity), alpha=0.5) +
scale_colour_manual(values = colors, name="Clarity")
p
OK, maybe not here because the graph gets clogged, so we can lighten it by sampling data:
p <- ggplot(diamonds[sample(nrow(diamonds), size=500),],
aes(x=carat,y=price, size=cut)) +
geom_point(aes(color=clarity), alpha=0.5) +
scale_colour_manual(values = colors, name="Clarity")
p
It is very easy to keep the same theme on all your graphs thanks to the theme() function. There are a collection of pre-defined themes, like:
You can define all the parameters you want, like this (hit ?theme like usual to see all the parameters):
my_theme <- theme_bw()+
theme(text = element_text(size = 18, family = "Times", face="bold"),
axis.ticks = element_line(size=1),
legend.text = element_text(size = 14, family = "Times"),
panel.border = element_rect(size=2),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
)
p + my_theme
Thanks to the plotly package, it is really easy to transform a ggplot plot into an interactive plot:
# load plotly
library(plotly)
p <- ggplot(diamonds[sample(nrow(diamonds), size=100),],
aes(x=carat,y=price)) +
geom_point(aes(color=clarity), alpha=0.5, size=2) +
my_theme
ggplotly(p, dynamicTicks = TRUE)
If you have several plot you want to gather on a grid and you can’t use facet_wrap() (because they come from different data sets), you can use the library patchwork:
library(patchwork)
library(ggplot2)
theme_set(theme_bw())
x <- seq(-2*pi,2*pi,.1)
p1 <- qplot(x,sin(x), geom="line")
p2 <- qplot(x,cos(x), geom="line")
p3 <- qplot(x,atan(x), geom="line")
p4 <- qplot(x,dnorm(x), geom="line")
p1 + p2
p1 + p2 / p3 + p4 +
plot_annotation(tag_levels = 'a', tag_suffix=")")
(p1 + p2 + plot_layout(widths = c(1,3))) /
p3/p4 +
plot_layout(heights = c(6, 2, 1))
If you want to make an inset plot, first make your plots, then add a plot within the other using annotation_custom(), specifying the x and y positions of the 4 corners of the inset plot:
library(ggplot2)
p4 + annotation_custom(ggplotGrob(p3),
xmin = -6.5, xmax = -1.5,
ymin = .2, ymax = .4)
tibbledata.frame in another single tidy one: it should have three columns, w, Intensity and file_namenorm01() that, given a vector, returns the same vector normalized to [0,1]group_by() and mutate(), add a column norm_int of normalized intensity for each file# Load them in two separate `tibbles`
library(tidyverse)
# Using read.table (who returns a data.frame)
df1 <- read.table("Data/PPC60_G_01.txt", col.names=c("w","Intensity"))
df2 <- read.table("Data/PPC60_G_30.txt", col.names=c("w","Intensity"))
df1 <- tibble(df1) # make a tibble from a data.frame
df2 <- tibble(df2)
# Direct version using tidyverse (read_table2 returns a tibble)
df1 <- read_table2("Data/PPC60_G_01.txt", col_names=c("w","Intensity"))
df2 <- read_table2("Data/PPC60_G_30.txt", col_names=c("w","Intensity"))
# Gather the two `tibbles` in another single tidy one:
# it should have three columns, `w`, `Intensity` and `file_name`
df1$file_name <- "PPC60_G_01" # add the "file_name" column
df2$file_name <- "PPC60_G_30"
df_tidy <- bind_rows(df1,df2) # stack the two tibbles
# Create a function `norm01` that, given a vector, returns the same vector normalized to [0,1]
norm01 <- function(x) {
(x-min(x))/(max(x)-min(x))
}
# Using `group_by` and `mutate`, add a column `norm_int` in df_tidy of normalized intensity for each file
df_tidy <- df_tidy %>%
group_by(file_name) %>%
mutate(norm_int=norm01(Intensity))
# Plot the two normalized spectra on the same graph using lines of different colors
library(ggplot2)
df_tidy %>%
ggplot(aes(x=w, y=norm_int, color=file_name))+
geom_line()+
theme_bw()
# Play with the theme and parameters to reproduce the following plot:
df_tidy %>%
ggplot(aes(x=w, y=norm_int, color=file_name)) +
geom_line()+
scale_color_manual(values=c("red","royalblue"), name="") +
labs(x="Raman Shift [1/cm]", y="Intensity [arb. unit]") +
theme_bw() +
theme(legend.position = "top",
text = element_text(size = 14,family = "Times"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
tibble.factor() function:x <- c("a","a","b","c","a")
factor(x)
## [1] a a b c a
## Levels: a b c
as.numeric(factor(x))
## [1] 1 1 2 3 1
annotate("text", x, y, label)library(tidyverse)
library(ggplot2)
df <- tibble()
norm01 <- function(x) {
(x-min(x))/(max(x)-min(x))
}
for (i in 1:4) {
d <- read_table2(paste0("Data/rubis_0",i,".txt"), col_names=c("w", "Int"))
d$Int_n <- norm01(d$Int)
d$name <- paste0("rubis_0",i)
df <- bind_rows(df, d)
}
fnames <- unique(df$name)
ggplot(data=df, aes(x=w,
y=Int_n+as.numeric(factor(name))-1,
color=name))+
geom_line(size=1)+
annotate("text", x=3080, y=1:length(fnames)-.85, label=fnames, size=5)+
labs(x="Raman Shift [1/cm]", y="Intensity [arb. units]")+
theme_bw()+
theme(legend.position = "none",
text = element_text(size = 14),
axis.text.y = element_blank(),
axis.text = element_text(size = 14))
Bonus:
frame option here)akima package and its interp() function. See chapter @ref(colorplots) on 3D plotting for help.# get the list of files for the ramps up and down and out of cell
files_up <- list.files("Data/dataG/", pattern="up")
files_down <- list.files("Data/dataG/", pattern="down")
out_cell <- list.files("Data/dataG/", pattern="out")
# store all file names in the correct order
allfiles <- c(out_cell, files_up, files_down)
# load the wanted package
library(tidyverse)
library(ggplot2)
# create the norm01 function
norm01 <- function(x) { (x-min(x))/(max(x)-min(x)) }
# initialize an empty tibble to store all data
alldata <- tibble()
for (file in allfiles) {#file <- allfiles[1]
# read the data and stor it in d
d <- read_table2(paste0("Data/dataG/",file), col_names=TRUE)
# normalize data
d$Int_n <- norm01(d$Int)
# store file name
d$name <- file
# store run number for the stacking
d$run_number <- which(file==allfiles)
# store all data in a single tidy tibble
alldata <- bind_rows(alldata, d)
}
# plot all data
alldata %>%
filter(w<=1750, w>=1500) %>% # zoom on the interesting part
ggplot(aes(x=w,
y=Int_n + run_number - 1))+ # to stack the plots
geom_point(color="gray", alpha=.5, size=.2)+ #plot data with points
xlim(c(1500,1800))+ #add some white space on the right to write the pressure
geom_vline(xintercept=1592, lty=2, size=1)+#show a vertical line
annotate(geom = "text", size=5, #show the pressure values
x = 1760, y=seq_along(allfiles)-1, hjust = 0,
label = paste(unique(alldata$P),"GPa"),
family = "Times")+
labs(x="Raman Shift [1/cm]", #have the good axis labels
y="Intensity [arb. units]")+
theme_bw()+#black and white theme
theme(legend.position = "none",#no legend
text = element_text(size = 14, family = "Times"),#text in font Times
axis.text.y = element_blank(),# no y axis values
axis.text = element_text(size = 14),
panel.grid.major = element_blank(), # no grid
panel.grid.minor = element_blank())
# Looking at data for increasing pressures, plot the data using an interactive slider
library(plotly)
P <- alldata %>%
filter(grepl("up",name)) %>% # only increasing pressures
filter(w<=1850, w>=1500) %>% # zoom on the interesting part
ggplot(aes(x=w,
y=Int_n,
frame=P))+ # each pressure in a new frame
geom_point(color="gray", alpha=.5, size=1)+ #plot data with points
labs(x="Raman Shift [1/cm]", #have the good axis labels
y="Intensity [arb. units]")+
theme_bw()+#black and white theme
theme(legend.position = "none",#no legend
text = element_text(size = 14, family = "Times"),
axis.text = element_text(size = 14),
panel.grid.major = element_blank(), # no grid
panel.grid.minor = element_blank())
ggplotly(P, dynamicTicks = TRUE)
# Plot the data using a 3D color map. Since the data are not on a regular grid,
# you will need to interpolate the data on a regular grid
# with the `akima` package and its `interp()` function
library(akima)
toplot <- alldata %>%
filter(grepl("up",name)) %>%
filter(w<=1850, w>=1500)
toplot.interp <- with(toplot,
interp(x = w, y = P, z = Int_n,
duplicate="median",
xo=seq(min(toplot$w), max(toplot$w), length = 100),
yo=seq(min(toplot$P), max(toplot$P), length = 100),
extrap=FALSE, linear=FALSE)
)
# toplot.interp is a list of 2 vectors and a matrix
str(toplot.interp)
## List of 3
## $ x: num [1:100] 1500 1504 1507 1511 1514 ...
## $ y: num [1:100] 2.12 2.35 2.59 2.82 3.06 ...
## $ z: num [1:100, 1:100] NA NA NA 0.0287 0.0291 ...
# Regrouping this list to a 3-columns data.frame
melt_x <- rep(toplot.interp$x, times=length(toplot.interp$y))
melt_y <- rep(toplot.interp$y, each=length(toplot.interp$x))
melt_z <- as.vector(toplot.interp$z)
toplot.smooth <- na.omit(data.frame(w=melt_x, Pressure=melt_y, Intensity=melt_z))
# Plotting
colors <- colorRampPalette(c("white","royalblue","seagreen","orange","red","brown"))(500)
P <- ggplot(data=toplot.smooth, aes(x=w, y=Pressure, fill=Intensity)) +
geom_raster() +
scale_fill_gradientn(colors=colors, name="Normalized\nIntensity\n[arb. units]") +
labs(x = "Raman Shift [1/cm]",y="Pressure [GPa]") +
theme_bw()+
theme(text = element_text(size = 14, family = "Times"),
axis.text = element_text(size = 14),
panel.grid.major = element_blank(), # no grid
panel.grid.minor = element_blank())
ggplotly(P, dynamicTicks = TRUE)
data.framedata.frame?
data.frame?reorder() and this help to use it with facets):# Load and tidy population data.frame
library(tidyverse)
df <- read.csv("Data/population.csv")
df <- pivot_longer(df,
cols=-year, #year should stay a column
names_to="City", #column names should go to the column `city`
values_to="Population" #values should go to the column `population`
)
df$City <- gsub("\\.", " ", df$City) # replace dots by spaces in city names
# Plot the population vs. year with a different color for each city
p <- ggplot(data=df, aes(x=year, y=Population, color=City))
# With points
p + geom_point()
# With lines
p + geom_line()
# With a black and white theme
# Change the axis labels to "year" and "Population"
p <- p + geom_line() + theme_bw(); p
# Make it interactive
library(plotly)
ggplotly(p, dynamicTicks = TRUE)
# Reproduce the plots
my_theme <- theme_bw()+
theme(axis.text = element_text(size = 14,family = "Helvetica",colour="black"),
text = element_text(size = 14,family = "Helvetica"),
axis.ticks = element_line(colour = "black"),
legend.text = element_text(size = 10,family = "Helvetica",colour="black"),
panel.border = element_rect(colour = "black", fill=NA, size=1)
)
colors <- c("royalblue","red")
p1 <- df %>% filter(City%in%c("Montpellier","Nantes")) %>%
ggplot(aes(x=year, y=Population, size=Population, color=City)) +
geom_point() +
geom_smooth(method="lm", aes(fill=City),
alpha=0.1, show.legend = FALSE) +
scale_color_manual(values=colors)+
scale_fill_manual(values=colors)+
ggtitle("Population in Montpellier and Nantes")+
labs(x="Year", y="Population")+
my_theme
p1
p2 <- df %>% filter(year==2012) %>%
ggplot(aes(x=reorder(City,-Population),
y=Population/1e6,
fill=Population/1e6)) +
geom_bar(stat="identity", position="dodge") +
ggtitle("Population in 2012 (in millions)")+
labs(x="", y="Population (in millions)")+
scale_fill_gradientn(colors=colors,
name="Population\n(in millions)") +
my_theme +
theme(axis.text.x = element_text(angle = 45, hjust=1))
p2